library(data.table)
library(tidyverse)
epa_2004 = fread("epa_2004.csv")
epa_2019 = fread("epa_2019.csv")
Now we can check the dimensions of each dataset.
# Check dimensions
dim(epa_2004)
## [1] 19233 20
# 19233 rows, 20 columns
dim(epa_2019)
## [1] 53156 20
# 53156 rows, 20 columns
The 2004 dataset has 19233 rows and 20 columns, whereas the 2019 dataset has 53156 rows and 20 columns.
# Check headers and footers
head(epa_2004)
## Date Source Site ID POC Daily Mean PM2.5 Concentration UNITS
## 1: 01/01/2004 AQS 60010007 1 8.9 ug/m3 LC
## 2: 01/02/2004 AQS 60010007 1 12.2 ug/m3 LC
## 3: 01/03/2004 AQS 60010007 1 16.5 ug/m3 LC
## 4: 01/04/2004 AQS 60010007 1 18.1 ug/m3 LC
## 5: 01/05/2004 AQS 60010007 1 11.5 ug/m3 LC
## 6: 01/06/2004 AQS 60010007 1 32.5 ug/m3 LC
## DAILY_AQI_VALUE Site Name DAILY_OBS_COUNT PERCENT_COMPLETE
## 1: 37 Livermore 1 100
## 2: 51 Livermore 1 100
## 3: 60 Livermore 1 100
## 4: 64 Livermore 1 100
## 5: 48 Livermore 1 100
## 6: 94 Livermore 1 100
## AQS_PARAMETER_CODE AQS_PARAMETER_DESC CBSA_CODE
## 1: 88101 PM2.5 - Local Conditions 41860
## 2: 88502 Acceptable PM2.5 AQI & Speciation Mass 41860
## 3: 88502 Acceptable PM2.5 AQI & Speciation Mass 41860
## 4: 88101 PM2.5 - Local Conditions 41860
## 5: 88502 Acceptable PM2.5 AQI & Speciation Mass 41860
## 6: 88502 Acceptable PM2.5 AQI & Speciation Mass 41860
## CBSA_NAME STATE_CODE STATE COUNTY_CODE COUNTY
## 1: San Francisco-Oakland-Hayward, CA 6 California 1 Alameda
## 2: San Francisco-Oakland-Hayward, CA 6 California 1 Alameda
## 3: San Francisco-Oakland-Hayward, CA 6 California 1 Alameda
## 4: San Francisco-Oakland-Hayward, CA 6 California 1 Alameda
## 5: San Francisco-Oakland-Hayward, CA 6 California 1 Alameda
## 6: San Francisco-Oakland-Hayward, CA 6 California 1 Alameda
## SITE_LATITUDE SITE_LONGITUDE
## 1: 37.68753 -121.7842
## 2: 37.68753 -121.7842
## 3: 37.68753 -121.7842
## 4: 37.68753 -121.7842
## 5: 37.68753 -121.7842
## 6: 37.68753 -121.7842
head(epa_2019)
## Date Source Site ID POC Daily Mean PM2.5 Concentration UNITS
## 1: 01/01/2019 AQS 60010007 3 5.7 ug/m3 LC
## 2: 01/02/2019 AQS 60010007 3 11.9 ug/m3 LC
## 3: 01/03/2019 AQS 60010007 3 20.1 ug/m3 LC
## 4: 01/04/2019 AQS 60010007 3 28.8 ug/m3 LC
## 5: 01/05/2019 AQS 60010007 3 11.2 ug/m3 LC
## 6: 01/06/2019 AQS 60010007 3 2.7 ug/m3 LC
## DAILY_AQI_VALUE Site Name DAILY_OBS_COUNT PERCENT_COMPLETE
## 1: 24 Livermore 1 100
## 2: 50 Livermore 1 100
## 3: 68 Livermore 1 100
## 4: 86 Livermore 1 100
## 5: 47 Livermore 1 100
## 6: 11 Livermore 1 100
## AQS_PARAMETER_CODE AQS_PARAMETER_DESC CBSA_CODE
## 1: 88101 PM2.5 - Local Conditions 41860
## 2: 88101 PM2.5 - Local Conditions 41860
## 3: 88101 PM2.5 - Local Conditions 41860
## 4: 88101 PM2.5 - Local Conditions 41860
## 5: 88101 PM2.5 - Local Conditions 41860
## 6: 88101 PM2.5 - Local Conditions 41860
## CBSA_NAME STATE_CODE STATE COUNTY_CODE COUNTY
## 1: San Francisco-Oakland-Hayward, CA 6 California 1 Alameda
## 2: San Francisco-Oakland-Hayward, CA 6 California 1 Alameda
## 3: San Francisco-Oakland-Hayward, CA 6 California 1 Alameda
## 4: San Francisco-Oakland-Hayward, CA 6 California 1 Alameda
## 5: San Francisco-Oakland-Hayward, CA 6 California 1 Alameda
## 6: San Francisco-Oakland-Hayward, CA 6 California 1 Alameda
## SITE_LATITUDE SITE_LONGITUDE
## 1: 37.68753 -121.7842
## 2: 37.68753 -121.7842
## 3: 37.68753 -121.7842
## 4: 37.68753 -121.7842
## 5: 37.68753 -121.7842
## 6: 37.68753 -121.7842
tail(epa_2004)
## Date Source Site ID POC Daily Mean PM2.5 Concentration UNITS
## 1: 12/14/2004 AQS 61131003 1 11 ug/m3 LC
## 2: 12/17/2004 AQS 61131003 1 16 ug/m3 LC
## 3: 12/20/2004 AQS 61131003 1 17 ug/m3 LC
## 4: 12/23/2004 AQS 61131003 1 9 ug/m3 LC
## 5: 12/26/2004 AQS 61131003 1 24 ug/m3 LC
## 6: 12/29/2004 AQS 61131003 1 9 ug/m3 LC
## DAILY_AQI_VALUE Site Name DAILY_OBS_COUNT PERCENT_COMPLETE
## 1: 46 Woodland-Gibson Road 1 100
## 2: 59 Woodland-Gibson Road 1 100
## 3: 61 Woodland-Gibson Road 1 100
## 4: 38 Woodland-Gibson Road 1 100
## 5: 76 Woodland-Gibson Road 1 100
## 6: 38 Woodland-Gibson Road 1 100
## AQS_PARAMETER_CODE AQS_PARAMETER_DESC CBSA_CODE
## 1: 88101 PM2.5 - Local Conditions 40900
## 2: 88101 PM2.5 - Local Conditions 40900
## 3: 88101 PM2.5 - Local Conditions 40900
## 4: 88101 PM2.5 - Local Conditions 40900
## 5: 88101 PM2.5 - Local Conditions 40900
## 6: 88101 PM2.5 - Local Conditions 40900
## CBSA_NAME STATE_CODE STATE COUNTY_CODE
## 1: Sacramento--Roseville--Arden-Arcade, CA 6 California 113
## 2: Sacramento--Roseville--Arden-Arcade, CA 6 California 113
## 3: Sacramento--Roseville--Arden-Arcade, CA 6 California 113
## 4: Sacramento--Roseville--Arden-Arcade, CA 6 California 113
## 5: Sacramento--Roseville--Arden-Arcade, CA 6 California 113
## 6: Sacramento--Roseville--Arden-Arcade, CA 6 California 113
## COUNTY SITE_LATITUDE SITE_LONGITUDE
## 1: Yolo 38.66121 -121.7327
## 2: Yolo 38.66121 -121.7327
## 3: Yolo 38.66121 -121.7327
## 4: Yolo 38.66121 -121.7327
## 5: Yolo 38.66121 -121.7327
## 6: Yolo 38.66121 -121.7327
tail(epa_2019)
## Date Source Site ID POC Daily Mean PM2.5 Concentration UNITS
## 1: 11/11/2019 AQS 61131003 1 13.5 ug/m3 LC
## 2: 11/17/2019 AQS 61131003 1 18.1 ug/m3 LC
## 3: 11/29/2019 AQS 61131003 1 12.5 ug/m3 LC
## 4: 12/17/2019 AQS 61131003 1 23.8 ug/m3 LC
## 5: 12/23/2019 AQS 61131003 1 1.0 ug/m3 LC
## 6: 12/29/2019 AQS 61131003 1 9.1 ug/m3 LC
## DAILY_AQI_VALUE Site Name DAILY_OBS_COUNT PERCENT_COMPLETE
## 1: 54 Woodland-Gibson Road 1 100
## 2: 64 Woodland-Gibson Road 1 100
## 3: 52 Woodland-Gibson Road 1 100
## 4: 76 Woodland-Gibson Road 1 100
## 5: 4 Woodland-Gibson Road 1 100
## 6: 38 Woodland-Gibson Road 1 100
## AQS_PARAMETER_CODE AQS_PARAMETER_DESC CBSA_CODE
## 1: 88101 PM2.5 - Local Conditions 40900
## 2: 88101 PM2.5 - Local Conditions 40900
## 3: 88101 PM2.5 - Local Conditions 40900
## 4: 88101 PM2.5 - Local Conditions 40900
## 5: 88101 PM2.5 - Local Conditions 40900
## 6: 88101 PM2.5 - Local Conditions 40900
## CBSA_NAME STATE_CODE STATE COUNTY_CODE
## 1: Sacramento--Roseville--Arden-Arcade, CA 6 California 113
## 2: Sacramento--Roseville--Arden-Arcade, CA 6 California 113
## 3: Sacramento--Roseville--Arden-Arcade, CA 6 California 113
## 4: Sacramento--Roseville--Arden-Arcade, CA 6 California 113
## 5: Sacramento--Roseville--Arden-Arcade, CA 6 California 113
## 6: Sacramento--Roseville--Arden-Arcade, CA 6 California 113
## COUNTY SITE_LATITUDE SITE_LONGITUDE
## 1: Yolo 38.66121 -121.7327
## 2: Yolo 38.66121 -121.7327
## 3: Yolo 38.66121 -121.7327
## 4: Yolo 38.66121 -121.7327
## 5: Yolo 38.66121 -121.7327
## 6: Yolo 38.66121 -121.7327
# Check variable names
colnames(epa_2004)
## [1] "Date" "Source"
## [3] "Site ID" "POC"
## [5] "Daily Mean PM2.5 Concentration" "UNITS"
## [7] "DAILY_AQI_VALUE" "Site Name"
## [9] "DAILY_OBS_COUNT" "PERCENT_COMPLETE"
## [11] "AQS_PARAMETER_CODE" "AQS_PARAMETER_DESC"
## [13] "CBSA_CODE" "CBSA_NAME"
## [15] "STATE_CODE" "STATE"
## [17] "COUNTY_CODE" "COUNTY"
## [19] "SITE_LATITUDE" "SITE_LONGITUDE"
colnames(epa_2019)
## [1] "Date" "Source"
## [3] "Site ID" "POC"
## [5] "Daily Mean PM2.5 Concentration" "UNITS"
## [7] "DAILY_AQI_VALUE" "Site Name"
## [9] "DAILY_OBS_COUNT" "PERCENT_COMPLETE"
## [11] "AQS_PARAMETER_CODE" "AQS_PARAMETER_DESC"
## [13] "CBSA_CODE" "CBSA_NAME"
## [15] "STATE_CODE" "STATE"
## [17] "COUNTY_CODE" "COUNTY"
## [19] "SITE_LATITUDE" "SITE_LONGITUDE"
The column names, or variables, for both datasets are the same and are of the same data type.
# Check variable types
str(epa_2004)
## Classes 'data.table' and 'data.frame': 19233 obs. of 20 variables:
## $ Date : chr "01/01/2004" "01/02/2004" "01/03/2004" "01/04/2004" ...
## $ Source : chr "AQS" "AQS" "AQS" "AQS" ...
## $ Site ID : int 60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 ...
## $ POC : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Daily Mean PM2.5 Concentration: num 8.9 12.2 16.5 18.1 11.5 32.5 14 29.9 21 16.9 ...
## $ UNITS : chr "ug/m3 LC" "ug/m3 LC" "ug/m3 LC" "ug/m3 LC" ...
## $ DAILY_AQI_VALUE : int 37 51 60 64 48 94 55 88 70 61 ...
## $ Site Name : chr "Livermore" "Livermore" "Livermore" "Livermore" ...
## $ DAILY_OBS_COUNT : int 1 1 1 1 1 1 1 1 1 1 ...
## $ PERCENT_COMPLETE : num 100 100 100 100 100 100 100 100 100 100 ...
## $ AQS_PARAMETER_CODE : int 88101 88502 88502 88101 88502 88502 88101 88502 88502 88502 ...
## $ AQS_PARAMETER_DESC : chr "PM2.5 - Local Conditions" "Acceptable PM2.5 AQI & Speciation Mass" "Acceptable PM2.5 AQI & Speciation Mass" "PM2.5 - Local Conditions" ...
## $ CBSA_CODE : int 41860 41860 41860 41860 41860 41860 41860 41860 41860 41860 ...
## $ CBSA_NAME : chr "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" ...
## $ STATE_CODE : int 6 6 6 6 6 6 6 6 6 6 ...
## $ STATE : chr "California" "California" "California" "California" ...
## $ COUNTY_CODE : int 1 1 1 1 1 1 1 1 1 1 ...
## $ COUNTY : chr "Alameda" "Alameda" "Alameda" "Alameda" ...
## $ SITE_LATITUDE : num 37.7 37.7 37.7 37.7 37.7 ...
## $ SITE_LONGITUDE : num -122 -122 -122 -122 -122 ...
## - attr(*, ".internal.selfref")=<externalptr>
str(epa_2019)
## Classes 'data.table' and 'data.frame': 53156 obs. of 20 variables:
## $ Date : chr "01/01/2019" "01/02/2019" "01/03/2019" "01/04/2019" ...
## $ Source : chr "AQS" "AQS" "AQS" "AQS" ...
## $ Site ID : int 60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 ...
## $ POC : int 3 3 3 3 3 3 3 3 3 3 ...
## $ Daily Mean PM2.5 Concentration: num 5.7 11.9 20.1 28.8 11.2 2.7 2.8 7 3.1 7.1 ...
## $ UNITS : chr "ug/m3 LC" "ug/m3 LC" "ug/m3 LC" "ug/m3 LC" ...
## $ DAILY_AQI_VALUE : int 24 50 68 86 47 11 12 29 13 30 ...
## $ Site Name : chr "Livermore" "Livermore" "Livermore" "Livermore" ...
## $ DAILY_OBS_COUNT : int 1 1 1 1 1 1 1 1 1 1 ...
## $ PERCENT_COMPLETE : num 100 100 100 100 100 100 100 100 100 100 ...
## $ AQS_PARAMETER_CODE : int 88101 88101 88101 88101 88101 88101 88101 88101 88101 88101 ...
## $ AQS_PARAMETER_DESC : chr "PM2.5 - Local Conditions" "PM2.5 - Local Conditions" "PM2.5 - Local Conditions" "PM2.5 - Local Conditions" ...
## $ CBSA_CODE : int 41860 41860 41860 41860 41860 41860 41860 41860 41860 41860 ...
## $ CBSA_NAME : chr "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" ...
## $ STATE_CODE : int 6 6 6 6 6 6 6 6 6 6 ...
## $ STATE : chr "California" "California" "California" "California" ...
## $ COUNTY_CODE : int 1 1 1 1 1 1 1 1 1 1 ...
## $ COUNTY : chr "Alameda" "Alameda" "Alameda" "Alameda" ...
## $ SITE_LATITUDE : num 37.7 37.7 37.7 37.7 37.7 ...
## $ SITE_LONGITUDE : num -122 -122 -122 -122 -122 ...
## - attr(*, ".internal.selfref")=<externalptr>
summary(epa_2004)
## Date Source Site ID POC
## Length:19233 Length:19233 Min. :60010007 Min. : 1.000
## Class :character Class :character 1st Qu.:60370002 1st Qu.: 1.000
## Mode :character Mode :character Median :60658001 Median : 1.000
## Mean :60588026 Mean : 1.816
## 3rd Qu.:60750006 3rd Qu.: 2.000
## Max. :61131003 Max. :12.000
##
## Daily Mean PM2.5 Concentration UNITS DAILY_AQI_VALUE
## Min. : -0.10 Length:19233 Min. : 0.00
## 1st Qu.: 6.00 Class :character 1st Qu.: 25.00
## Median : 10.10 Mode :character Median : 42.00
## Mean : 13.13 Mean : 46.33
## 3rd Qu.: 16.30 3rd Qu.: 60.00
## Max. :251.00 Max. :301.00
##
## Site Name DAILY_OBS_COUNT PERCENT_COMPLETE AQS_PARAMETER_CODE
## Length:19233 Min. :1 Min. :100 Min. :88101
## Class :character 1st Qu.:1 1st Qu.:100 1st Qu.:88101
## Mode :character Median :1 Median :100 Median :88101
## Mean :1 Mean :100 Mean :88267
## 3rd Qu.:1 3rd Qu.:100 3rd Qu.:88502
## Max. :1 Max. :100 Max. :88502
##
## AQS_PARAMETER_DESC CBSA_CODE CBSA_NAME STATE_CODE
## Length:19233 Min. :12540 Length:19233 Min. :6
## Class :character 1st Qu.:31080 Class :character 1st Qu.:6
## Mode :character Median :40140 Mode :character Median :6
## Mean :35328 Mean :6
## 3rd Qu.:41860 3rd Qu.:6
## Max. :49700 Max. :6
## NA's :1253
## STATE COUNTY_CODE COUNTY SITE_LATITUDE
## Length:19233 Min. : 1.00 Length:19233 Min. :32.63
## Class :character 1st Qu.: 37.00 Class :character 1st Qu.:34.07
## Mode :character Median : 65.00 Mode :character Median :36.48
## Mean : 58.63 Mean :36.23
## 3rd Qu.: 75.00 3rd Qu.:38.10
## Max. :113.00 Max. :41.71
##
## SITE_LONGITUDE
## Min. :-124.2
## 1st Qu.:-121.6
## Median :-119.3
## Mean :-119.7
## 3rd Qu.:-117.9
## Max. :-115.5
##
summary(epa_2019)
## Date Source Site ID POC
## Length:53156 Length:53156 Min. :60010007 Min. : 1.000
## Class :character Class :character 1st Qu.:60310004 1st Qu.: 1.000
## Mode :character Mode :character Median :60612003 Median : 3.000
## Mean :60565264 Mean : 2.573
## 3rd Qu.:60771002 3rd Qu.: 3.000
## Max. :61131003 Max. :21.000
##
## Daily Mean PM2.5 Concentration UNITS DAILY_AQI_VALUE
## Min. : -2.20 Length:53156 Min. : 0.00
## 1st Qu.: 4.00 Class :character 1st Qu.: 17.00
## Median : 6.50 Mode :character Median : 27.00
## Mean : 7.74 Mean : 30.58
## 3rd Qu.: 9.90 3rd Qu.: 41.00
## Max. :120.90 Max. :185.00
##
## Site Name DAILY_OBS_COUNT PERCENT_COMPLETE AQS_PARAMETER_CODE
## Length:53156 Min. :1 Min. :100 Min. :88101
## Class :character 1st Qu.:1 1st Qu.:100 1st Qu.:88101
## Mode :character Median :1 Median :100 Median :88101
## Mean :1 Mean :100 Mean :88214
## 3rd Qu.:1 3rd Qu.:100 3rd Qu.:88502
## Max. :1 Max. :100 Max. :88502
##
## AQS_PARAMETER_DESC CBSA_CODE CBSA_NAME STATE_CODE
## Length:53156 Min. :12540 Length:53156 Min. :6
## Class :character 1st Qu.:31080 Class :character 1st Qu.:6
## Mode :character Median :40140 Mode :character Median :6
## Mean :35839 Mean :6
## 3rd Qu.:41860 3rd Qu.:6
## Max. :49700 Max. :6
## NA's :4181
## STATE COUNTY_CODE COUNTY SITE_LATITUDE
## Length:53156 Min. : 1.00 Length:53156 Min. :32.58
## Class :character 1st Qu.: 31.00 Class :character 1st Qu.:34.14
## Mode :character Median : 61.00 Mode :character Median :36.63
## Mean : 56.38 Mean :36.34
## 3rd Qu.: 77.00 3rd Qu.:37.97
## Max. :113.00 Max. :41.76
##
## SITE_LONGITUDE
## Min. :-124.2
## 1st Qu.:-121.6
## Median :-119.8
## Mean :-119.8
## 3rd Qu.:-118.1
## Max. :-115.5
##
We can then check variables of interest. To answer our question, this would be the daily mean PM2.5 concentration.
# Check variable of interest - daily mean PM2.5 concentration
table(epa_2004$`Daily Mean PM2.5 Concentration`)
##
## -0.1 0 0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1 1.1
## 1 11 15 20 27 32 35 41 44 40 31 94 39
## 1.2 1.3 1.4 1.5 1.6 1.7 1.8 1.9 2 2.1 2.2 2.3 2.4
## 41 37 32 30 44 36 38 35 113 59 44 40 46
## 2.5 2.6 2.7 2.8 2.9 3 3.1 3.2 3.3 3.4 3.5 3.6 3.7
## 53 61 48 57 65 198 61 64 88 53 74 80 78
## 3.8 3.9 4 4.1 4.2 4.3 4.4 4.5 4.6 4.7 4.8 4.9 5
## 70 59 365 71 75 72 80 94 60 73 86 80 425
## 5.1 5.2 5.3 5.4 5.5 5.6 5.7 5.8 5.9 6 6.1 6.2 6.3
## 82 92 97 84 102 72 95 90 106 426 85 95 96
## 6.4 6.5 6.6 6.7 6.8 6.9 7 7.1 7.2 7.3 7.4 7.5 7.6
## 86 104 66 95 81 87 413 87 101 82 91 104 63
## 7.7 7.8 7.9 8 8.1 8.2 8.3 8.4 8.5 8.6 8.7 8.8 8.9
## 123 100 83 406 73 113 85 97 111 86 96 94 91
## 9 9.1 9.2 9.3 9.4 9.5 9.6 9.7 9.8 9.9 10 10.1 10.2
## 358 83 104 82 86 99 67 107 89 73 276 97 100
## 10.3 10.4 10.5 10.6 10.7 10.8 10.9 11 11.1 11.2 11.3 11.4 11.5
## 86 70 109 79 96 85 91 266 67 86 94 71 85
## 11.6 11.7 11.8 11.9 12 12.1 12.2 12.3 12.4 12.5 12.6 12.7 12.8
## 74 74 73 60 229 74 75 70 77 74 82 67 70
## 12.9 13 13.1 13.2 13.3 13.4 13.5 13.6 13.7 13.8 13.9 14 14.1
## 55 165 67 74 61 55 70 65 58 60 49 138 65
## 14.2 14.3 14.4 14.5 14.6 14.7 14.8 14.9 15 15.1 15.2 15.3 15.4
## 64 56 51 46 52 70 49 58 138 53 58 58 51
## 15.5 15.6 15.7 15.8 15.9 16 16.1 16.2 16.3 16.4 16.5 16.6 16.7
## 52 46 48 44 53 125 35 49 47 50 50 40 40
## 16.8 16.9 17 17.1 17.2 17.3 17.4 17.5 17.6 17.7 17.8 17.9 18
## 46 32 98 39 36 33 46 42 34 38 39 25 86
## 18.1 18.2 18.3 18.4 18.5 18.6 18.7 18.8 18.9 19 19.1 19.2 19.3
## 36 38 24 17 35 42 32 23 28 70 30 19 30
## 19.4 19.5 19.6 19.7 19.8 19.9 20 20.1 20.2 20.3 20.4 20.5 20.6
## 20 42 41 22 17 29 84 27 32 29 20 25 25
## 20.7 20.8 20.9 21 21.1 21.2 21.3 21.4 21.5 21.6 21.7 21.8 21.9
## 23 21 19 64 22 16 16 15 22 18 19 15 11
## 22 22.1 22.2 22.3 22.4 22.5 22.6 22.7 22.8 22.9 23 23.1 23.2
## 39 27 16 18 14 23 17 20 18 15 55 22 12
## 23.3 23.4 23.5 23.6 23.7 23.8 23.9 24 24.1 24.2 24.3 24.4 24.5
## 22 20 15 18 16 20 12 47 19 18 13 6 17
## 24.6 24.7 24.8 24.9 25 25.1 25.2 25.3 25.4 25.5 25.6 25.7 25.8
## 12 14 12 14 54 16 14 14 12 16 14 18 8
## 25.9 26 26.1 26.2 26.3 26.4 26.5 26.6 26.7 26.8 26.9 27 27.1
## 12 50 15 8 13 13 14 16 10 8 12 43 14
## 27.2 27.3 27.4 27.5 27.6 27.7 27.8 27.9 28 28.1 28.2 28.3 28.4
## 23 8 4 13 7 6 11 9 37 22 20 13 10
## 28.5 28.6 28.7 28.8 28.9 29 29.1 29.2 29.3 29.4 29.5 29.6 29.7
## 9 11 8 7 17 30 13 10 12 6 16 20 12
## 29.8 29.9 30 30.1 30.2 30.3 30.4 30.5 30.6 30.7 30.8 30.9 31
## 11 10 39 14 17 12 9 12 11 12 10 5 29
## 31.1 31.2 31.3 31.4 31.5 31.7 31.8 31.9 32 32.1 32.2 32.3 32.4
## 6 15 4 10 8 8 17 8 27 7 12 12 9
## 32.5 32.6 32.7 32.8 32.9 33 33.1 33.2 33.3 33.4 33.5 33.6 33.7
## 10 9 8 11 10 31 4 4 10 13 11 6 12
## 33.8 33.9 34 34.1 34.2 34.3 34.4 34.5 34.6 34.7 34.8 34.9 35
## 6 4 34 4 7 3 11 7 7 4 4 9 23
## 35.1 35.2 35.3 35.4 35.5 35.6 35.7 35.8 35.9 36 36.1 36.2 36.3
## 6 15 6 5 8 6 4 3 2 23 6 9 3
## 36.4 36.5 36.6 36.7 36.8 36.9 37 37.1 37.2 37.3 37.4 37.5 37.6
## 6 9 7 8 7 5 19 5 5 8 10 4 5
## 37.7 37.8 37.9 38 38.1 38.2 38.3 38.4 38.5 38.6 38.7 38.8 38.9
## 4 7 3 24 4 8 4 7 9 3 6 5 3
## 39 39.1 39.2 39.3 39.4 39.5 39.6 39.7 39.8 39.9 40 40.1 40.2
## 16 7 6 3 6 4 6 6 3 4 7 8 6
## 40.3 40.4 40.5 40.6 40.7 40.8 40.9 41 41.1 41.2 41.3 41.4 41.5
## 4 8 11 1 3 7 3 12 5 7 5 2 9
## 41.6 41.7 41.8 41.9 42 42.1 42.2 42.3 42.4 42.5 42.6 42.7 42.9
## 3 5 3 5 9 5 6 6 5 4 5 5 8
## 43 43.1 43.2 43.3 43.4 43.5 43.6 43.7 43.8 43.9 44 44.1 44.2
## 11 5 6 6 6 3 2 5 1 3 12 5 3
## 44.3 44.4 44.5 44.6 44.7 44.8 44.9 45 45.1 45.2 45.3 45.4 45.5
## 5 2 2 6 4 3 6 6 1 1 1 4 4
## 45.7 45.8 45.9 46 46.1 46.2 46.3 46.4 46.5 46.6 46.7 46.8 46.9
## 7 4 3 3 3 2 1 3 7 1 4 1 5
## 47 47.1 47.2 47.3 47.5 47.6 47.7 47.8 47.9 48 48.1 48.2 48.3
## 7 2 5 2 2 3 3 4 2 3 1 2 2
## 48.4 48.5 48.6 48.7 48.9 49 49.2 49.3 49.4 49.5 49.6 49.7 49.9
## 1 3 1 4 2 5 1 3 5 1 2 5 1
## 50 50.1 50.2 50.4 50.5 50.6 50.8 50.9 51 51.2 51.4 51.5 51.7
## 4 4 1 2 1 3 1 2 5 4 1 3 2
## 51.8 51.9 52 52.1 52.2 52.4 52.5 52.7 52.8 52.9 53 53.1 53.2
## 1 1 4 2 1 2 3 1 1 3 5 1 2
## 53.3 53.5 53.7 53.8 53.9 54 54.2 54.3 54.4 54.6 54.8 54.9 55
## 1 1 1 3 1 2 2 2 1 2 2 1 3
## 55.1 55.2 55.3 55.5 55.6 55.7 55.8 56 56.1 56.2 56.3 56.4 56.8
## 1 1 1 2 1 1 3 1 1 2 1 3 1
## 57 57.2 57.3 57.4 57.9 58.1 58.4 58.7 58.9 59.1 59.2 59.3 59.4
## 4 1 3 1 1 1 2 1 2 1 2 1 1
## 59.5 59.7 59.9 60 60.1 60.3 60.4 60.5 60.7 60.8 60.9 61 61.2
## 2 2 2 2 1 1 1 1 1 2 1 3 1
## 61.5 61.7 61.8 62.5 62.6 62.7 63.1 63.4 63.9 64 64.9 65 65.3
## 1 1 2 2 1 1 1 1 1 1 1 2 1
## 65.4 66.1 66.3 66.6 67.1 67.3 67.4 68.2 68.6 68.7 68.9 69 69.3
## 2 3 2 2 1 1 3 1 1 1 1 2 1
## 70 70.6 71 71.4 72.4 72.8 73.6 73.7 74.2 74.5 75 75.6 76.8
## 1 2 1 2 1 2 2 1 1 1 1 1 1
## 77.1 77.5 79.8 80.9 81 81.4 81.6 81.9 82.3 83 86.1 90.2 90.7
## 1 1 1 1 1 1 2 1 1 2 1 1 1
## 90.9 91.7 93.4 93.8 95.7 100.4 102.1 110.4 122.5 148.4 170.4 251
## 1 1 1 1 1 1 1 1 1 1 1 1
summary(epa_2004$`Daily Mean PM2.5 Concentration`)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.10 6.00 10.10 13.13 16.30 251.00
table(epa_2019$`Daily Mean PM2.5 Concentration`)
##
## -2.2 -2 -1.9 -1.8 -1.7 -1.6 -1.5 -1.4 -1.3 -1.2 -1.1 -1 -0.9
## 1 12 16 11 12 12 8 11 10 16 9 12 10
## -0.8 -0.7 -0.6 -0.5 -0.4 -0.3 -0.2 -0.1 0 0.1 0.2 0.3 0.4
## 11 15 9 15 13 24 29 26 70 39 69 83 83
## 0.5 0.6 0.7 0.8 0.9 1 1.1 1.2 1.3 1.4 1.5 1.6 1.7
## 126 135 159 144 160 358 187 213 206 223 295 235 332
## 1.8 1.9 2 2.1 2.2 2.3 2.4 2.5 2.6 2.7 2.8 2.9 3
## 273 286 450 272 410 366 308 479 384 471 395 398 564
## 3.1 3.2 3.3 3.4 3.5 3.6 3.7 3.8 3.9 4 4.1 4.2 4.3
## 402 528 459 400 588 458 591 454 470 719 468 601 506
## 4.4 4.5 4.6 4.7 4.8 4.9 5 5.1 5.2 5.3 5.4 5.5 5.6
## 493 616 520 619 505 478 726 472 603 506 492 596 486
## 5.7 5.8 5.9 6 6.1 6.2 6.3 6.4 6.5 6.6 6.7 6.8 6.9
## 603 491 452 578 445 537 460 452 577 397 548 431 425
## 7 7.1 7.2 7.3 7.4 7.5 7.6 7.7 7.8 7.9 8 8.1 8.2
## 540 373 446 411 420 488 399 488 389 372 487 335 407
## 8.3 8.4 8.5 8.6 8.7 8.8 8.9 9 9.1 9.2 9.3 9.4 9.5
## 373 307 456 320 392 328 306 428 293 384 312 307 388
## 9.6 9.7 9.8 9.9 10 10.1 10.2 10.3 10.4 10.5 10.6 10.7 10.8
## 257 337 297 251 338 228 316 270 254 308 232 255 244
## 10.9 11 11.1 11.2 11.3 11.4 11.5 11.6 11.7 11.8 11.9 12 12.1
## 206 250 224 260 187 202 232 197 227 153 140 234 141
## 12.2 12.3 12.4 12.5 12.6 12.7 12.8 12.9 13 13.1 13.2 13.3 13.4
## 179 157 172 180 142 169 121 132 168 134 153 139 111
## 13.5 13.6 13.7 13.8 13.9 14 14.1 14.2 14.3 14.4 14.5 14.6 14.7
## 158 125 141 102 118 133 82 128 83 111 107 88 106
## 14.8 14.9 15 15.1 15.2 15.3 15.4 15.5 15.6 15.7 15.8 15.9 16
## 90 82 124 84 114 67 90 88 72 86 73 69 71
## 16.1 16.2 16.3 16.4 16.5 16.6 16.7 16.8 16.9 17 17.1 17.2 17.3
## 73 75 51 55 68 60 60 53 38 75 52 54 36
## 17.4 17.5 17.6 17.7 17.8 17.9 18 18.1 18.2 18.3 18.4 18.5 18.6
## 40 54 40 51 42 44 34 42 46 38 30 52 41
## 18.7 18.8 18.9 19 19.1 19.2 19.3 19.4 19.5 19.6 19.7 19.8 19.9
## 36 35 35 36 31 34 34 40 43 32 29 26 31
## 20 20.1 20.2 20.3 20.4 20.5 20.6 20.7 20.8 20.9 21 21.1 21.2
## 41 29 28 18 26 24 21 39 29 20 40 20 26
## 21.3 21.4 21.5 21.6 21.7 21.8 21.9 22 22.1 22.2 22.3 22.4 22.5
## 16 22 21 11 14 17 10 33 21 20 14 16 24
## 22.6 22.7 22.8 22.9 23 23.1 23.2 23.3 23.4 23.5 23.6 23.7 23.8
## 19 12 16 11 20 14 18 17 19 21 11 5 11
## 23.9 24 24.1 24.2 24.3 24.4 24.5 24.6 24.7 24.8 24.9 25 25.1
## 16 12 12 12 10 9 11 15 19 11 10 11 10
## 25.2 25.3 25.4 25.5 25.6 25.7 25.8 25.9 26 26.1 26.2 26.3 26.4
## 6 5 12 15 3 11 10 6 11 9 8 10 9
## 26.5 26.6 26.7 26.8 26.9 27 27.1 27.2 27.3 27.4 27.5 27.6 27.7
## 10 9 10 9 5 12 6 11 8 14 4 8 6
## 27.8 27.9 28 28.1 28.2 28.3 28.4 28.5 28.6 28.7 28.8 28.9 29
## 9 2 15 9 11 8 3 9 6 10 5 5 6
## 29.1 29.2 29.3 29.4 29.5 29.6 29.7 29.8 29.9 30 30.1 30.2 30.3
## 6 6 7 6 8 10 11 8 7 11 2 4 3
## 30.4 30.5 30.6 30.7 30.8 30.9 31 31.1 31.2 31.3 31.4 31.5 31.6
## 4 13 12 8 7 10 7 9 10 5 6 8 5
## 31.7 31.8 31.9 32 32.1 32.2 32.3 32.4 32.5 32.6 32.7 32.8 32.9
## 7 2 4 3 3 5 4 3 5 3 7 7 5
## 33 33.1 33.2 33.3 33.4 33.5 33.6 33.7 33.8 33.9 34 34.1 34.2
## 3 8 8 2 4 6 2 4 5 4 2 3 5
## 34.3 34.4 34.5 34.6 34.7 34.8 34.9 35 35.1 35.2 35.3 35.4 35.5
## 1 4 4 4 3 5 2 1 4 1 3 3 4
## 35.6 35.7 35.8 35.9 36 36.1 36.2 36.3 36.4 36.7 36.8 36.9 37
## 3 2 2 3 3 2 5 6 7 2 2 3 1
## 37.1 37.2 37.3 37.4 37.5 37.6 37.7 37.8 37.9 38 38.1 38.3 38.4
## 7 3 1 1 3 1 1 3 3 1 2 2 2
## 38.5 38.6 38.7 38.9 39 39.1 39.2 39.3 39.4 39.5 39.6 39.7 39.8
## 2 3 1 2 4 2 3 1 1 5 2 3 1
## 39.9 40 40.1 40.2 40.3 40.4 40.5 40.6 40.7 40.9 41 41.1 41.2
## 2 2 3 2 4 1 1 1 3 5 1 4 2
## 41.3 41.4 41.5 41.6 41.7 41.8 41.9 42.2 42.3 42.8 43.1 43.3 43.4
## 2 3 1 2 1 1 1 1 1 1 2 1 3
## 43.5 43.6 44 44.2 44.3 44.4 44.5 44.7 44.8 45.1 45.3 45.4 45.5
## 1 1 1 2 2 1 1 1 1 1 1 1 1
## 45.7 45.8 46 46.3 46.4 46.5 46.7 47.1 47.2 47.4 47.5 47.9 48
## 1 1 1 1 4 1 3 3 1 1 1 1 1
## 48.1 48.2 48.8 49 49.3 49.4 49.6 50.1 50.2 50.6 50.7 50.9 51.3
## 1 1 1 1 1 2 1 1 1 2 2 2 1
## 52.3 52.4 53 53.1 54.7 55.7 57 57.6 57.7 58.2 58.8 59.1 60.4
## 1 1 2 2 1 1 1 2 1 1 1 1 1
## 60.5 62.2 62.6 63.4 66.1 68.4 68.5 70.1 70.3 71.2 73.9 75.1 77.3
## 1 1 1 1 1 1 1 1 1 1 1 1 1
## 77.4 81.3 83.7 90.7 91.1 97.3 103.5 104.5 120.9
## 1 1 1 1 1 1 1 1 1
summary(epa_2019$`Daily Mean PM2.5 Concentration`)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -2.20 4.00 6.50 7.74 9.90 120.90
From the summary and tables above, we can see some values below 0 in both datasets, as well as one very large value in the 2004 dataset. We can extract them for closer analysis.
epa_2004$`Site Name`[epa_2004$`Daily Mean PM2.5 Concentration` == 251]
## [1] "Yosemite NP-Yosemite Village Vistor Center"
epa_2004$`Site Name`[epa_2004$`Daily Mean PM2.5 Concentration` == -0.1]
## [1] "Kaiser Wilderness"
sum(epa_2019$`Daily Mean PM2.5 Concentration` < 0)
## [1] 282
epa_2019 %>% filter(`Daily Mean PM2.5 Concentration` < 0) %>% count(`Site Name`) %>% arrange(desc(n))
## Site Name n
## 1: Tahoe City-Fairway Drive 153
## 2: Lake Elsinore 30
## 3: Lompoc H Street 12
## 4: Arroyo Grande CDF 9
## 5: Piru - Pacific 5
## 6: Ridgecrest-Ward 5
## 7: Salinas 3 5
## 8: Lebec 4
## 9: Red Bluff-Walnut St. District Office 4
## 10: Table Mountain Air Monitoring Site 4
## 11: Ukiah-Library 4
## 12: Lancaster-Division Street 3
## 13: Pechanga 3
## 14: Atascadero 2
## 15: Camp Pendleton 2
## 16: Chico-East Avenue 2
## 17: Colfax-City Hall 2
## 18: Concord 2
## 19: Folsom-Natoma St. 2
## 20: Goleta 2
## 21: Huron 2
## 22: Paradise - Theater 2
## 23: Santa Maria 2
## 24: Sloughhouse 2
## 25: Upland 2
## 26: Weaverville-Courthouse 2
## 27: Willits-125 East Commercial Street 2
## 28: Auburn-Atwood 1
## 29: Carmel Valley 1
## 30: El Rio-Rio Mesa School #2 1
## 31: Laney College 1
## 32: Manteca 1
## 33: Morongo Air Monitoring Station 1
## 34: Oakland West 1
## 35: Redwood City 1
## 36: Rubidoux 1
## 37: Sacramento-Del Paso Manor 1
## 38: Simi Valley-Cochran Street 1
## 39: Thousand Oaks 1
## 40: Tranquillity 1
## Site Name n
When examining the variable of interest specifically (daily mean PM5.2 concentration), there is one observation in the 2004 dataset with a daily mean PM2.5 concentration of -0.1 and one observation with a daily mean PM2.5 concentration of 251. It’s impossible to have a negative or zero concentration, and 251 is over two times higher than the next lowest value, which is 170.4. The 251 ug/m3 value was obtained in Yosemite, and the negative value was obtained in Kaiser Wilderness. There are 282 observations in the 2019 dataset with negative values, across many different sites. The site with the most negative values from 2019 is Tahoe City-Fairway Drive with 153 recorded negative values.
Just to verify that these values don’t significantly affect the data, we can check how many negative or very high values there are.
filter(epa_2004) %>% summarize(negative = mean(epa_2004$`Daily Mean PM2.5 Concentration` < 0, na.rm = TRUE))
## negative
## 1 5.199397e-05
filter(epa_2019) %>% summarize(negative = mean(epa_2019$`Daily Mean PM2.5 Concentration` < 0, na.rm = TRUE))
## negative
## 1 0.00530514
filter(epa_2004) %>% summarize(negative = mean(epa_2004$`Daily Mean PM2.5 Concentration` > 200, na.rm = TRUE))
## negative
## 1 5.199397e-05
The proportion of negative values is very small in the 2004 dataset. Likewise, the proportion of values over 200 is very small in the 2004 dataset. The proportion of negative values is larger, but still less than 1% in the 2019 dataset. There were no unusually high values in the 2019 dataset.
epa_2004 <- epa_2004 %>% mutate(year = 2004)
epa_2019 <- epa_2019 %>% mutate(year = 2019)
epa_2004 <- rename(epa_2004, "daily_mean_pm" = "Daily Mean PM2.5 Concentration")
epa_2019 <- rename(epa_2019, "daily_mean_pm" = "Daily Mean PM2.5 Concentration")
pm_merged <- full_join(epa_2004, epa_2019)
## Joining, by = c("Date", "Source", "Site ID", "POC", "daily_mean_pm", "UNITS",
## "DAILY_AQI_VALUE", "Site Name", "DAILY_OBS_COUNT", "PERCENT_COMPLETE",
## "AQS_PARAMETER_CODE", "AQS_PARAMETER_DESC", "CBSA_CODE", "CBSA_NAME",
## "STATE_CODE", "STATE", "COUNTY_CODE", "COUNTY", "SITE_LATITUDE",
## "SITE_LONGITUDE", "year")
The variable “Daily Mean PM2.5 Concentration” was renamed “daily_mean_pm” to make referencing easier.
nrow(epa_2004) + nrow(epa_2019) == nrow(pm_merged)
## [1] TRUE
The number of rows in the combined dataset does equal the sum of rows in each dataset, so the merge was successful.
We will look at the spatial distribution of the sites by year. A column added to the fully merged dataset denoting if the site was present in only 2004, 2019, or both years will be helpful for this.
library(leaflet)
pm_merged$year_diff <- ifelse(pm_merged$`Site ID` %in% setdiff(epa_2004$`Site ID`, epa_2019$`Site ID`), "2004 only", ifelse(pm_merged$`Site ID` %in% setdiff(epa_2019$`Site ID`, epa_2004$`Site ID`), "2019 only", "both years"))
pal <- colorFactor(palette = c("green", "red", "blue"), domain = pm_merged$year_diff)
years_map = leaflet(pm_merged) %>%
addProviderTiles('CartoDB.Positron') %>%
addCircles(lat=~SITE_LATITUDE,lng=~SITE_LONGITUDE, color = ~pal(year_diff), opacity=1, fillOpacity=1, radius=40) %>% addLegend(position = "topright", pal = pal, values = ~year_diff, title = "Year")
years_map
The points are pretty evenly spread out over the entire state of California with a higher concentration of sites near the coast. However, there are still many sites found in central California and in alpine regions. There are very few sites in the desert region of the state near the borders of Nevada and Arizona. Sites only present in 2004 were not very common in the mountain regions; sites only present in 2019 and sites present in both datasets were more widely distributed throughout the state.
As found in an earlier section, there are negative and high values present in the data for both years, but their proportions within their respective datasets are quite small. Within the merged dataset, we can verify that this observation still holds true.
filter(pm_merged) %>% summarize(negative = mean(pm_merged$daily_mean_pm < 0, na.rm = TRUE))
## negative
## 1 0.003909434
filter(pm_merged) %>% summarize(high = mean(pm_merged$daily_mean_pm > 200, na.rm = TRUE))
## high
## 1 1.381425e-05
We can see that the percentage of negative values in the merged dataset is around 0.39%, and the percentage of very high values in the merged dataset is 0.0014%. These implausible values make up only a very small proportion of the merged dataset.
However, we can still look at the temporal patterns in these values. First, we can look at the year distribution for negative values. From earlier, we know that 1 negative value was present in the 2004 dataset, and 282 were from the 2019 dataset.
pm_neg = pm_merged[pm_merged$daily_mean_pm < 0, ]
pm_neg %>% count(year) %>% mutate(n/sum(n))
## year n n/sum(n)
## 1: 2004 1 0.003533569
## 2: 2019 282 0.996466431
In proportions, 0.35% of the negative values came from 2004 and 99.65% cof the negative values came from 2019. We can do something similar at the sites’ inclusion in either or both datasets.
sum(pm_neg$year_diff == "2004 only")
## [1] 0
sum(pm_neg$year_diff == "2019 only")
## [1] 255
sum(pm_neg$year_diff == "both years")
## [1] 28
pm_neg %>% count(year_diff) %>% mutate(prop = n/sum(n))
## year_diff n prop
## 1: 2019 only 255 0.90106007
## 2: both years 28 0.09893993
Looking at the sites’ inclusion in both years’ datasets, 255 observations came from sites only recorded in 2019, whereas 2 observations came from sites recorded from both years. None came from sites only recorded in 2004. In proportions, 90.11% of the negative values came from sites only recorded in 2019, and 9.89% came from sites recorded in both years.
Now, we can look at the temporal patterns of these implausible values in terms of the month. Since we only have one negative value in the 2004 dataset, we can just print it. Similarly, we can do the same for the one value over 200 in the 2004 dataset.
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:data.table':
##
## hour, isoweek, mday, minute, month, quarter, second, wday, week,
## yday, year
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
pm_merged$year = as.factor(pm_merged$year)
pm_merged$Date = as.Date(pm_merged$Date, "%m/%d/%Y")
pm_merged$Date[pm_merged$year == "2004" & pm_merged$daily_mean_pm < 0]
## [1] "2004-12-08"
pm_merged$Date[pm_merged$year == "2004" & pm_merged$daily_mean_pm > 200]
## [1] "2004-07-18"
The negative value in 2004 occured on December 8, 2004. The very high value occured on July 18, 2004.
However, the 2019 dataset has a lot more negative values than in the 2004 dataset. We can extract those values and look at them more closely.
negative_2019 <- filter(pm_merged, year == "2019") %>%
mutate(negative = daily_mean_pm < 0, date = ymd(Date)) %>%
select(date, negative)
mutate(negative_2019, month = factor(lubridate::month(negative_2019$date)), levels = month) %>%
group_by(month) %>%
summarize(pct.negative = mean(negative, na.rm = TRUE) * 100) %>% arrange(desc(pct.negative))
## # A tibble: 12 × 2
## month pct.negative
## <fct> <dbl>
## 1 3 1.16
## 2 4 0.941
## 3 5 0.636
## 4 2 0.602
## 5 12 0.588
## 6 6 0.493
## 7 9 0.481
## 8 1 0.376
## 9 7 0.345
## 10 8 0.326
## 11 10 0.294
## 12 11 0.113
We can see that all these values are quite small. Most negative values come from March and April, with 1.16% and 0.94% of March and April observations being negative respectively. We can’t be completely sure why they occurred, but since they are a very small proportion of the entire dataset, we can remove them.
pm_merged_filtered = pm_merged[pm_merged$daily_mean_pm > 0 & pm_merged$daily_mean_pm < 200, ]
summary(pm_merged_filtered$daily_mean_pm)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.100 4.400 7.300 9.219 11.400 170.400
The new minimum for the merged dataset’s daily mean concentration of PM2.5 is 0.1 ug/m3. The new maximum is 170.4 ug/m3.
First, let’s take a look at PM2.5 levels across the entire state. The data was cleaned so that only sites present in both datasets are used for data visualization; this is see the difference between both years for sites that have data recorded in both 2004 and 2019.
pm_merged_filtered$year = as.factor(pm_merged_filtered$year)
pm_merged_filtered$Date = as.Date(pm_merged_filtered$Date, "%m/%d/%Y")
both_years = pm_merged_filtered[pm_merged_filtered$year_diff == "both years", ]
both_years %>% ggplot(mapping = aes(x = year, y = daily_mean_pm)) + geom_boxplot() + xlab("Year") + ylab("Daily Mean PM2.5 in ug/m3")
data_2004 = both_years[both_years$year == "2004", ]
data_2019 = both_years[both_years$year == "2019", ]
data_2004 %>% ggplot(mapping = aes(x = daily_mean_pm)) + geom_histogram(binwidth = 3) + xlab("Daily Mean PM2.5 in ug/m3") + ylab("Frequency") + ggtitle("Histogram of Daily Mean PM2.5 in 2004")
data_2019 %>% ggplot(mapping = aes(x = daily_mean_pm)) + geom_histogram(binwidth = 3) + xlab("Daily Mean PM2.5 in ug/m3") + ylab("Frequency") + ggtitle("Histogram of Daily Mean PM2.5 in 2019")
From the boxplot, we can see visually that the mean of the daily mean of PM2.5 across the entire state in 2004 is higher than 2019. The first quartile for the daily mean of PM2.5 across the state in 2004 is very similar to the mean of 2019, whereas the third quartile for the daily mean of PM2.5 across the state in 2019 is very similar to the mean of 2004. Furthermore, there is less variation in recorded PM2.5 concentrations in 2019 than in 2004.
From the histogram, looking at the y axis limits on the 2004 dataset versus the 2019 datset, the range for the 2004 dataset is much smaller with the upper bound of the y axis at around 2500 counts whereas the 2019 dataset has an upper bound of over 10,000 counts. This suggests that the daily PM2.5 values in 2004 have more counts per observed value; in other words, more values appear more times in the 2004 dataset compared to the 2019 dataset.
Next, looking at the x-axis limits, the x-axis in the 2004 data has a much higher upper bound than the 2019 data; the upper bound for the 2004 data goes past 150, whereas the upper bound for the 2019 data goes a little past 120. The difference in x-axis and y-axis limits validate the boxplots in that there is more variation in the 2004 dataset, which includes higher values of PM2.5 concentration compared to the 2019 dataset. Ultimately, this observation implies that the 2019 dataset has many less high PM2.5 concentration values compared to 2004.
Now, let’s take a look at PM2.5 levels at a county level. We first have to clean out the counties that are present in only 2004 or 2019.
county_level <- group_by(both_years, COUNTY, year) %>% summarize(PM = mean(daily_mean_pm, na.rm = TRUE))
## `summarise()` has grouped output by 'COUNTY'. You can override using the
## `.groups` argument.
head(county_level)
## # A tibble: 6 × 3
## # Groups: COUNTY [3]
## COUNTY year PM
## <chr> <fct> <dbl>
## 1 Alameda 2004 11.3
## 2 Alameda 2019 6.37
## 3 Butte 2004 8.32
## 4 Butte 2019 10.8
## 5 Calaveras 2004 7.61
## 6 Calaveras 2019 5.46
tail(county_level)
## # A tibble: 6 × 3
## # Groups: COUNTY [3]
## COUNTY year PM
## <chr> <fct> <dbl>
## 1 Tulare 2004 16.6
## 2 Tulare 2019 11.3
## 3 Ventura 2004 11.2
## 4 Ventura 2019 7.09
## 5 Yolo 2004 9.66
## 6 Yolo 2019 6.85
From just looking at the first 6 rows and the last 6 rows, we can see that for the first 3 and last 3 counties, most daily man PM values have gone down from 2004 to 2019 except for Butte County, where the mean daily PM2.5 has gone up. We can visualize this for all counties using a graph.
qplot(year, PM, data = mutate(county_level, year = as.numeric(as.character(year))),
color = factor(COUNTY),
geom = c("point", "line"))
Most counties have decreased their PM2.5 levels except for a few. We can check for which counties this has occurred.
county_unique = as.vector(unique(county_level$COUNTY))
county_means_2004 = as.vector(county_level$PM[county_level$year == "2004"])
county_means_2019 = as.vector(county_level$PM[county_level$year == "2019"])
diff = county_means_2004 - county_means_2019
diff
## [1] 4.8897036 -2.4690631 2.1456106 3.1016234 6.0369257 0.7704325
## [7] 0.8730541 4.4211578 1.4647938 0.1846959 8.8770818 7.5492794
## [13] 6.8686997 1.1310256 4.7087828 0.8657961 7.6266667 -2.6738296
## [19] 2.5079198 0.6955684 7.3028218 7.3715532 0.7663162 8.5593959
## [25] 5.6435392 0.9705365 5.8685829 3.9522810 3.2612407 3.9700495
## [31] 3.9740311 1.4087500 2.6995841 0.2547653 0.5277335 0.3512759
## [37] 3.8518149 5.8367018 3.4332922 0.7919269 5.3542257 4.1039754
## [43] 2.8138116
The diff vector contains all the differences between the mean daily mean PM2.5 in each county from 2004 to 2019. Counties with lower PM2.5 in 2019 will have positive differences; counties with higher PM2.5 in 2019 will have negative differences. We can grab the counties with a negative difference.
county_unique[c(diff < 0)]
## [1] "Butte" "Mono"
We can see that Butte and Mono Counties have had higher PM2.5 levels in 2019 than in 2004. However, for the remaining counties, PM2.5 levels in 2019 have been reduced from 2004. We can find the proportion of counties with increases among all counties examined in both 2004 to 2019.
county_increase = county_unique[c(diff < 0)]
length(county_increase)/length(county_unique)
## [1] 0.04651163
Counties with increases in PM2.5 concentration constitute only 4.65% of all counties. For the majority of counties, on a county-level, PM2.5 concentration has decreased over the last 15 years in California.
Finally, we can examine this at a site level.
site_level <- group_by(both_years, `Site ID`, year) %>% summarize(PM = mean(daily_mean_pm, na.rm = TRUE))
## `summarise()` has grouped output by 'Site ID'. You can override using the
## `.groups` argument.
head(site_level)
## # A tibble: 6 × 3
## # Groups: Site ID [3]
## `Site ID` year PM
## <int> <fct> <dbl>
## 1 60010007 2004 11.3
## 2 60010007 2019 6.37
## 3 60074001 2004 8.32
## 4 60074001 2019 10.8
## 5 60090001 2004 7.61
## 6 60090001 2019 5.46
tail(site_level)
## # A tibble: 6 × 3
## # Groups: Site ID [3]
## `Site ID` year PM
## <int> <fct> <dbl>
## 1 61113001 2004 11.3
## 2 61113001 2019 6.59
## 3 61130004 2004 9.46
## 4 61130004 2019 6.72
## 5 61131003 2004 10.3
## 6 61131003 2019 7.72
From just looking at the first 6 rows and the last 6 rows, we can see that for the first 3 and last 3 sites, most daily man PM values have gone down from 2004 to 2019 except for site 60074001, where the mean daily PM2.5 has gone up. We can visualize this for all sites again using a graph.
qplot(year, PM, data = mutate(site_level, year = as.numeric(as.character(year))),
color = factor(`Site ID`),
geom = c("point", "line"))
Like the county-level map, we can see that most sites saw a decrease in their PM2.5 levels from 2004 to 2019, but there area few which saw increases. We can also extract this using a similar method as for the site level.
site_unique = as.vector(unique(site_level$`Site ID`))
site_means_2004 = as.vector(site_level$PM[site_level$year == "2004"])
site_means_2019 = as.vector(site_level$PM[site_level$year == "2019"])
site_diff = site_means_2004 - site_means_2019
site_unique[c(site_diff < 0)]
## [1] 60074001 60290011 60510001 60570005
We can see that 4 sites saw increases in PM2.5. We can check their county to see if any of them come from the two counties we extrapolated earlier.
site_increases = site_unique[c(site_diff < 0)]
unique(both_years$COUNTY[both_years$`Site ID` %in% site_increases])
## [1] "Butte" "Kern" "Mono" "Nevada"
length(site_increases)/length(site_unique)
## [1] 0.05128205
The 4 sites which saw increases were in Butte, Kern, Mono, and Nevada counties. Butte and Mono were two counties that did see a county-level wide increase of PM2.5 from 2004 to 2019. However, sites where PM2.5 concentration increased from 2004 to 2019 represent only 5.13% of all sites present in both datasets. For the majority of sites, on a site-level, PM2.5 concentration has decreased over the last 15 years in California.